From 1666086b00442b23e4fd70f4971e3bcf1a16b124 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 30 Sep 2022 15:16:22 +0200 Subject: [PATCH] x86/NUMA: improve memnode_shift calculation for multi node system MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit SRAT may describe individual nodes using multiple ranges. When they're adjacent (with or without a gap in between), only the start of the first such range actually needs accounting for. Furthermore the very first range doesn't need considering of its start address at all, as it's fine to associate all lower addresses (with no memory) with that same node. For this to work, the array of ranges needs to be sorted by address - adjust logic accordingly in acpi_numa_memory_affinity_init(). Signed-off-by: Jan Beulich Acked-by: Roger Pau Monné --- xen/arch/x86/numa.c | 3 ++- xen/arch/x86/srat.c | 32 ++++++++++++++++++++++++++++---- 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/xen/arch/x86/numa.c b/xen/arch/x86/numa.c index 4f742414b0..2c3c1c15fe 100644 --- a/xen/arch/x86/numa.c +++ b/xen/arch/x86/numa.c @@ -127,7 +127,8 @@ static int __init extract_lsb_from_nodes(const struct node *nodes, epdx = paddr_to_pdx(nodes[i].end - 1) + 1; if ( spdx >= epdx ) continue; - bitfield |= spdx; + if ( i && (!nodeids || nodeids[i - 1] != nodeids[i]) ) + bitfield |= spdx; if ( !i || !nodeids || nodeids[i - 1] != nodeids[i] ) nodes_used++; if ( epdx > memtop ) diff --git a/xen/arch/x86/srat.c b/xen/arch/x86/srat.c index b62a152911..fbcd8749c4 100644 --- a/xen/arch/x86/srat.c +++ b/xen/arch/x86/srat.c @@ -312,6 +312,7 @@ acpi_numa_memory_affinity_init(const struct acpi_srat_mem_affinity *ma) unsigned pxm; nodeid_t node; unsigned int i; + bool next = false; if (srat_disabled()) return; @@ -413,14 +414,37 @@ acpi_numa_memory_affinity_init(const struct acpi_srat_mem_affinity *ma) node, pxm, start, end - 1, ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE ? " (hotplug)" : ""); - node_memblk_range[num_node_memblks].start = start; - node_memblk_range[num_node_memblks].end = end; - memblk_nodeid[num_node_memblks] = node; + /* Keep node_memblk_range[] sorted by address. */ + for (i = 0; i < num_node_memblks; ++i) + if (node_memblk_range[i].start > start || + (node_memblk_range[i].start == start && + node_memblk_range[i].end > end)) + break; + + memmove(&node_memblk_range[i + 1], &node_memblk_range[i], + (num_node_memblks - i) * sizeof(*node_memblk_range)); + node_memblk_range[i].start = start; + node_memblk_range[i].end = end; + + memmove(&memblk_nodeid[i + 1], &memblk_nodeid[i], + (num_node_memblks - i) * sizeof(*memblk_nodeid)); + memblk_nodeid[i] = node; + if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) { - __set_bit(num_node_memblks, memblk_hotplug); + next = true; if (end > mem_hotplug) mem_hotplug = end; } + for (; i <= num_node_memblks; ++i) { + bool prev = next; + + next = test_bit(i, memblk_hotplug); + if (prev) + __set_bit(i, memblk_hotplug); + else + __clear_bit(i, memblk_hotplug); + } + num_node_memblks++; } -- 2.30.2